{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# ch4.4  编程实现基于基尼指数的决策树算法\n",
    "试编程实现基于基尼指数进行划分选择的决策树算法，为表4.2中数据生成预剪枝、后剪枝决策树，并与未剪枝决策树进行比较。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "from ch4 import TreeGenerate, print_tree,TreeGenerate_preprune, TreeGenerate_postprune, print_rate"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 课后习题4.4 使用表4.2\n",
    "基于基尼指数的未剪枝决策树（使用表4.2的全部数据训练）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "homework 4.4 using table 4.2\n",
      "\n",
      "\n",
      "texture=distinct \n",
      "\n",
      "    root=little_curl_up \n",
      "\n",
      "        color=black \n",
      "\n",
      "            touch=hard_smooth    1\n",
      "\n",
      "            touch=soft_stick    0\n",
      "\n",
      "        color=dark_green    1\n",
      "\n",
      "    root=stiff    0\n",
      "\n",
      "    root=curl_up    1\n",
      "\n",
      "texture=blur    0\n",
      "\n",
      "texture=little_blur \n",
      "\n",
      "    touch=hard_smooth    0\n",
      "\n",
      "    touch=soft_stick    10.7058823529411765\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.7058823529411765"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load Table 4.2 and drop the 'Idx' column before handing the frame to the tree builder.\n",
    "dataset = pd.read_csv('data/table_4_2_watermelon_2_0.csv')\n",
    "dataset = dataset.drop('Idx',axis=1)\n",
    "\n",
    "# Build the tree on the whole table and print it.\n",
    "nd = TreeGenerate(dataset, 1)\n",
    "print(\"homework 4.4 using table 4.2\")\n",
    "print_tree(nd, 0)\n",
    "\n",
    "# Score the tree built in this cell. The previous code passed `rn`, which is only assigned by\n",
    "# the pruning cells further down: on a fresh kernel that raises NameError, and after an\n",
    "# out-of-order run it silently scores a stale pruned tree instead of `nd`.\n",
    "print_rate(dataset, nd)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 将数据分为训练集和验证集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Reload Table 4.2 so this cell does not depend on the `dataset` left behind by earlier cells.\n",
    "dataset = pd.read_csv('data/table_4_2_watermelon_2_0.csv')\n",
    "dataset = dataset.drop('Idx',axis=1)\n",
    "\n",
    "# Split the rows into a training part and a validation part by row label.\n",
    "# `.ix` was deprecated in pandas 0.20 and removed in pandas 1.0. With the default integer\n",
    "# index produced by read_csv it selected by label, so `.loc` picks exactly the same rows.\n",
    "train_dataset = dataset.loc[[0,1,2,5,6,9,13,14,15,16],:]\n",
    "val_dataset = dataset.loc[[3,4,7,8,10,11,12],:]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 基于基尼指数的预剪枝决策树"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "color 0.428571428571 0.571428571429\n",
      "root 0.428571428571 0.428571428571\n",
      "texture 0.571428571429 0.0\n",
      "knocks 0.571428571429 0.571428571429\n",
      "\n",
      "\n",
      "color=black \n",
      "\n",
      "    root=little_curl_up    0\n",
      "\n",
      "    root=curl_up    1\n",
      "\n",
      "color=dark_green \n",
      "\n",
      "    knocks=clear    0\n",
      "\n",
      "    knocks=little_heavily    1\n",
      "\n",
      "    knocks=heavily    0\n",
      "\n",
      "color=light_white    0\n",
      "\n",
      "0.9\n",
      "0.42857142857142855\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.42857142857142855"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build the pre-pruned tree from the training split and the validation split.\n",
    "rn = TreeGenerate_preprune(train_dataset, val_dataset, 1)\n",
    "\n",
    "# Show the resulting tree, followed by a blank separator.\n",
    "print_tree(rn, 0)\n",
    "print(\"\\n\")\n",
    "\n",
    "# Report the rate on each split; the value returned by the last call is the cell result.\n",
    "print_rate(train_dataset, rn)\n",
    "print_rate(val_dataset, rn)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 基于基尼指数的后剪枝决策树"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.0\n",
      "0.5\n",
      "0.0\n",
      "0.5714285714285714\n",
      "\n",
      "\n",
      "color=black \n",
      "\n",
      "    root=little_curl_up    0\n",
      "\n",
      "    root=curl_up    1\n",
      "\n",
      "color=dark_green    1\n",
      "\n",
      "color=light_white    0\n",
      "\n",
      "0.7\n",
      "0.5714285714285714\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.5714285714285714"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Grow a tree on the training split first, then pass it to the post-pruning routine\n",
    "# together with the validation split.\n",
    "nd = TreeGenerate(train_dataset, 1)\n",
    "rn = TreeGenerate_postprune(nd, val_dataset)\n",
    "\n",
    "# Show the pruned tree, followed by a blank separator.\n",
    "print_tree(rn, 0)\n",
    "print(\"\\n\")\n",
    "\n",
    "# Report the rate on each split; the value returned by the last call is the cell result.\n",
    "print_rate(train_dataset, rn)\n",
    "print_rate(val_dataset, rn)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda env:anaconda]",
   "language": "python",
   "name": "conda-env-anaconda-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
